PART II: German Credit Score Classification Model Explainability

By: Krishna J

Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn               as sns
import matplotlib.pyplot     as plt
import shap
import eli5
from sklearn.model_selection import train_test_split
#from sklearn.ensemble        import RandomForestClassifier
#from sklearn.linear_model    import LogisticRegression
from sklearn.preprocessing   import MinMaxScaler, StandardScaler
from sklearn.base            import TransformerMixin
from sklearn.pipeline        import Pipeline, FeatureUnion
from typing                  import List, Union, Dict
# Warnings will be used to silence various model warnings for tidier output
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
np.random.seed(0)
pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.

Importing the source dataset

In [2]:
german_xai=pd.read_csv('C:/Users/krish/Downloads/German-mapped.csv')

Converting categorical fields to numerical fields

In [3]:
german_xai=pd.get_dummies(german_xai,columns=['CreditHistory','Purpose','Savings','EmployDuration','Debtors','Collateral','OtherPayBackPlan','Property','Job'])
german_xai.head()
Out[3]:
NumMonths CreditAmount PayBackPercent Gender ResidenceDuration Age ExistingCredit Dependents Telephone Foreignworker ... OtherPayBackPlan_bank OtherPayBackPlan_none OtherPayBackPlan_stores Property_free Property_own Property_rent Job_management/self-emp/officer/highly qualif emp Job_skilled employee Job_unemp/unskilled-non resident Job_unskilled-resident
0 6 1169 4 1 4 1 2 1 1 1 ... 0 1 0 0 1 0 0 1 0 0
1 48 5951 2 0 2 0 1 1 0 1 ... 0 1 0 0 1 0 0 1 0 0
2 12 2096 2 1 3 1 1 2 0 1 ... 0 1 0 0 1 0 0 0 0 1
3 42 7882 2 1 4 1 1 2 0 1 ... 0 1 0 1 0 0 0 1 0 0
4 24 4870 3 1 4 1 2 2 0 1 ... 0 1 0 1 0 0 0 1 0 0

5 rows × 50 columns

In [4]:
german_xai.columns
Out[4]:
Index(['NumMonths', 'CreditAmount', 'PayBackPercent', 'Gender',
       'ResidenceDuration', 'Age', 'ExistingCredit', 'Dependents', 'Telephone',
       'Foreignworker', 'Marital_Status', 'CreditStatus',
       'CreditHistory_Delay', 'CreditHistory_none/paid', 'CreditHistory_other',
       'Purpose_CarNew', 'Purpose_CarUsed', 'Purpose_biz',
       'Purpose_domestic app', 'Purpose_education', 'Purpose_furniture/equip',
       'Purpose_others', 'Purpose_radio/tv', 'Purpose_repairs',
       'Purpose_retraining', 'Savings_500+', 'Savings_<500', 'Savings_none',
       'EmployDuration_1-4 yr', 'EmployDuration_4-7 yr',
       'EmployDuration_<1 yr', 'EmployDuration_>=7 yr',
       'EmployDuration_unemployed', 'Debtors_co-applicant',
       'Debtors_guarantor', 'Debtors_none', 'Collateral_car/other',
       'Collateral_real estate', 'Collateral_savings/life insurance',
       'Collateral_unknown/none', 'OtherPayBackPlan_bank',
       'OtherPayBackPlan_none', 'OtherPayBackPlan_stores', 'Property_free',
       'Property_own', 'Property_rent',
       'Job_management/self-emp/officer/highly qualif emp',
       'Job_skilled employee', 'Job_unemp/unskilled-non resident',
       'Job_unskilled-resident'],
      dtype='object')
In [5]:
german_xai = german_xai.reindex(columns=['NumMonths', 'CreditAmount', 'PayBackPercent', 'Gender',
       'ResidenceDuration', 'Age', 'ExistingCredit', 'Dependents', 'Telephone',
       'Foreignworker', 'Marital_Status',
       'CreditHistory_Delay', 'CreditHistory_none/paid', 'CreditHistory_other',
       'Purpose_CarNew', 'Purpose_CarUsed', 'Purpose_biz',
       'Purpose_domestic app', 'Purpose_education', 'Purpose_furniture/equip',
       'Purpose_others', 'Purpose_radio/tv', 'Purpose_repairs',
       'Purpose_retraining', 'Savings_500+', 'Savings_<500', 'Savings_none',
       'EmployDuration_1-4 yr', 'EmployDuration_4-7 yr',
       'EmployDuration_<1 yr', 'EmployDuration_>=7 yr',
       'EmployDuration_unemployed', 'Debtors_co-applicant',
       'Debtors_guarantor', 'Debtors_none', 'Collateral_car/other',
       'Collateral_real estate', 'Collateral_savings/life insurance',
       'Collateral_unknown/none', 'OtherPayBackPlan_bank',
       'OtherPayBackPlan_none', 'OtherPayBackPlan_stores', 'Property_free',
       'Property_own', 'Property_rent',
       'Job_management/self-emp/officer/highly qualif emp',
       'Job_skilled employee', 'Job_unemp/unskilled-non resident',
       'Job_unskilled-resident', 'CreditStatus'])
german_xai.head()
Out[5]:
NumMonths CreditAmount PayBackPercent Gender ResidenceDuration Age ExistingCredit Dependents Telephone Foreignworker ... OtherPayBackPlan_none OtherPayBackPlan_stores Property_free Property_own Property_rent Job_management/self-emp/officer/highly qualif emp Job_skilled employee Job_unemp/unskilled-non resident Job_unskilled-resident CreditStatus
0 6 1169 4 1 4 1 2 1 1 1 ... 1 0 0 1 0 0 1 0 0 1
1 48 5951 2 0 2 0 1 1 0 1 ... 1 0 0 1 0 0 1 0 0 0
2 12 2096 2 1 3 1 1 2 0 1 ... 1 0 0 1 0 0 0 0 1 1
3 42 7882 2 1 4 1 1 2 0 1 ... 1 0 1 0 0 0 1 0 0 1
4 24 4870 3 1 4 1 2 2 0 1 ... 1 0 1 0 0 0 1 0 0 0

5 rows × 50 columns

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
german_xai[['CreditAmount']]=scaler.fit_transform(german_xai[['CreditAmount']])

Writing data to csv file

In [7]:
german_xai.to_csv('C:/Users/krish/Downloads/German-encoded.csv', index=False)

Splitting into train and test data

In [8]:
X = german_xai.iloc[:, :-1]
y = german_xai['CreditStatus']
X.head()
y.head()
X_train,X_test,y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=40,stratify=y)
Out[8]:
NumMonths CreditAmount PayBackPercent Gender ResidenceDuration Age ExistingCredit Dependents Telephone Foreignworker ... OtherPayBackPlan_bank OtherPayBackPlan_none OtherPayBackPlan_stores Property_free Property_own Property_rent Job_management/self-emp/officer/highly qualif emp Job_skilled employee Job_unemp/unskilled-non resident Job_unskilled-resident
0 6 0.050567 4 1 4 1 2 1 1 1 ... 0 1 0 0 1 0 0 1 0 0
1 48 0.313690 2 0 2 0 1 1 0 1 ... 0 1 0 0 1 0 0 1 0 0
2 12 0.101574 2 1 3 1 1 2 0 1 ... 0 1 0 0 1 0 0 0 0 1
3 42 0.419941 2 1 4 1 1 2 0 1 ... 0 1 0 1 0 0 0 1 0 0
4 24 0.254209 3 1 4 1 2 2 0 1 ... 0 1 0 1 0 0 0 1 0 0

5 rows × 49 columns

Out[8]:
0    1
1    0
2    1
3    1
4    0
Name: CreditStatus, dtype: int64
In [9]:
german_xai.dtypes
Out[9]:
NumMonths                                              int64
CreditAmount                                         float64
PayBackPercent                                         int64
Gender                                                 int64
ResidenceDuration                                      int64
Age                                                    int64
ExistingCredit                                         int64
Dependents                                             int64
Telephone                                              int64
Foreignworker                                          int64
Marital_Status                                         int64
CreditHistory_Delay                                    uint8
CreditHistory_none/paid                                uint8
CreditHistory_other                                    uint8
Purpose_CarNew                                         uint8
Purpose_CarUsed                                        uint8
Purpose_biz                                            uint8
Purpose_domestic app                                   uint8
Purpose_education                                      uint8
Purpose_furniture/equip                                uint8
Purpose_others                                         uint8
Purpose_radio/tv                                       uint8
Purpose_repairs                                        uint8
Purpose_retraining                                     uint8
Savings_500+                                           uint8
Savings_<500                                           uint8
Savings_none                                           uint8
EmployDuration_1-4 yr                                  uint8
EmployDuration_4-7 yr                                  uint8
EmployDuration_<1 yr                                   uint8
EmployDuration_>=7 yr                                  uint8
EmployDuration_unemployed                              uint8
Debtors_co-applicant                                   uint8
Debtors_guarantor                                      uint8
Debtors_none                                           uint8
Collateral_car/other                                   uint8
Collateral_real estate                                 uint8
Collateral_savings/life insurance                      uint8
Collateral_unknown/none                                uint8
OtherPayBackPlan_bank                                  uint8
OtherPayBackPlan_none                                  uint8
OtherPayBackPlan_stores                                uint8
Property_free                                          uint8
Property_own                                           uint8
Property_rent                                          uint8
Job_management/self-emp/officer/highly qualif emp      uint8
Job_skilled employee                                   uint8
Job_unemp/unskilled-non resident                       uint8
Job_unskilled-resident                                 uint8
CreditStatus                                           int64
dtype: object
In [10]:
import klib
klib.missingval_plot(X)
klib.missingval_plot(y)
No missing values found in the dataset.
No missing values found in the dataset.

Feature Selection

1. Using Mutual info classif

In [11]:
from sklearn.feature_selection import mutual_info_classif
mutual_info=mutual_info_classif(X_train, y_train,random_state=40)
mutual_info
Out[11]:
array([0.06019707, 0.02108839, 0.00223861, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00353083, 0.02491331,
       0.        , 0.        , 0.00457068, 0.00844984, 0.        ,
       0.        , 0.01332007, 0.        , 0.        , 0.01102085,
       0.02957691, 0.        , 0.        , 0.00160791, 0.03429891,
       0.0457149 , 0.        , 0.        , 0.00819407, 0.00311788,
       0.02706385, 0.        , 0.        , 0.00205918, 0.0261869 ,
       0.        , 0.00280856, 0.00092909, 0.03215574, 0.02567867,
       0.00608715, 0.00376107, 0.        , 0.00802985, 0.        ,
       0.        , 0.        , 0.        , 0.        ])

Estimate mutual information for a discrete target variable.

Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.

In [12]:
X.columns
Out[12]:
Index(['NumMonths', 'CreditAmount', 'PayBackPercent', 'Gender',
       'ResidenceDuration', 'Age', 'ExistingCredit', 'Dependents', 'Telephone',
       'Foreignworker', 'Marital_Status', 'CreditHistory_Delay',
       'CreditHistory_none/paid', 'CreditHistory_other', 'Purpose_CarNew',
       'Purpose_CarUsed', 'Purpose_biz', 'Purpose_domestic app',
       'Purpose_education', 'Purpose_furniture/equip', 'Purpose_others',
       'Purpose_radio/tv', 'Purpose_repairs', 'Purpose_retraining',
       'Savings_500+', 'Savings_<500', 'Savings_none', 'EmployDuration_1-4 yr',
       'EmployDuration_4-7 yr', 'EmployDuration_<1 yr',
       'EmployDuration_>=7 yr', 'EmployDuration_unemployed',
       'Debtors_co-applicant', 'Debtors_guarantor', 'Debtors_none',
       'Collateral_car/other', 'Collateral_real estate',
       'Collateral_savings/life insurance', 'Collateral_unknown/none',
       'OtherPayBackPlan_bank', 'OtherPayBackPlan_none',
       'OtherPayBackPlan_stores', 'Property_free', 'Property_own',
       'Property_rent', 'Job_management/self-emp/officer/highly qualif emp',
       'Job_skilled employee', 'Job_unemp/unskilled-non resident',
       'Job_unskilled-resident'],
      dtype='object')
In [13]:
mutual_info=pd.Series(mutual_info)
mutual_info.index=X_train.columns
mutual_info.sort_values(ascending=False)
Out[13]:
NumMonths                                            0.060197
Savings_<500                                         0.045715
Savings_500+                                         0.034299
Collateral_unknown/none                              0.032156
Purpose_others                                       0.029577
EmployDuration_>=7 yr                                0.027064
Debtors_none                                         0.026187
OtherPayBackPlan_bank                                0.025679
Foreignworker                                        0.024913
CreditAmount                                         0.021088
Purpose_biz                                          0.013320
Purpose_furniture/equip                              0.011021
CreditHistory_other                                  0.008450
EmployDuration_4-7 yr                                0.008194
Property_own                                         0.008030
OtherPayBackPlan_none                                0.006087
CreditHistory_none/paid                              0.004571
OtherPayBackPlan_stores                              0.003761
Telephone                                            0.003531
EmployDuration_<1 yr                                 0.003118
Collateral_real estate                               0.002809
PayBackPercent                                       0.002239
Debtors_guarantor                                    0.002059
Purpose_retraining                                   0.001608
Collateral_savings/life insurance                    0.000929
Property_free                                        0.000000
Property_rent                                        0.000000
Job_management/self-emp/officer/highly qualif emp    0.000000
Collateral_car/other                                 0.000000
Job_skilled employee                                 0.000000
Job_unemp/unskilled-non resident                     0.000000
Gender                                               0.000000
Savings_none                                         0.000000
Debtors_co-applicant                                 0.000000
EmployDuration_unemployed                            0.000000
EmployDuration_1-4 yr                                0.000000
ResidenceDuration                                    0.000000
Purpose_repairs                                      0.000000
Purpose_radio/tv                                     0.000000
Purpose_education                                    0.000000
Purpose_domestic app                                 0.000000
Purpose_CarUsed                                      0.000000
Purpose_CarNew                                       0.000000
CreditHistory_Delay                                  0.000000
Marital_Status                                       0.000000
Dependents                                           0.000000
ExistingCredit                                       0.000000
Age                                                  0.000000
Job_unskilled-resident                               0.000000
dtype: float64
In [14]:
mutual_info.sort_values(ascending=False).plot.bar(figsize=(15,5))
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x2837e56bf88>

Selecting top 10 features having highest dependencies w.r.to target variable CreditStatus

In [15]:
mutual_info.sort_values(ascending=False)[0:22]
Out[15]:
NumMonths                  0.060197
Savings_<500               0.045715
Savings_500+               0.034299
Collateral_unknown/none    0.032156
Purpose_others             0.029577
EmployDuration_>=7 yr      0.027064
Debtors_none               0.026187
OtherPayBackPlan_bank      0.025679
Foreignworker              0.024913
CreditAmount               0.021088
Purpose_biz                0.013320
Purpose_furniture/equip    0.011021
CreditHistory_other        0.008450
EmployDuration_4-7 yr      0.008194
Property_own               0.008030
OtherPayBackPlan_none      0.006087
CreditHistory_none/paid    0.004571
OtherPayBackPlan_stores    0.003761
Telephone                  0.003531
EmployDuration_<1 yr       0.003118
Collateral_real estate     0.002809
PayBackPercent             0.002239
dtype: float64
In [16]:
german_xai_imp=german_xai[['Gender','Age','Marital_Status','NumMonths','Savings_<500','Savings_none','Dependents','Property_rent',
                           'Job_management/self-emp/officer/highly qualif emp','Debtors_guarantor','Purpose_CarNew',
                           'Purpose_furniture/equip','CreditHistory_none/paid','Purpose_CarUsed','CreditAmount',
                           'Collateral_real estate','Debtors_none','Job_unemp/unskilled-non resident','Purpose_others',             
                            'CreditHistory_other','PayBackPercent','Collateral_unknown/none','Purpose_education', 'CreditStatus']]
german_xai_imp.head()
Out[16]:
Gender Age Marital_Status NumMonths Savings_<500 Savings_none Dependents Property_rent Job_management/self-emp/officer/highly qualif emp Debtors_guarantor ... CreditAmount Collateral_real estate Debtors_none Job_unemp/unskilled-non resident Purpose_others CreditHistory_other PayBackPercent Collateral_unknown/none Purpose_education CreditStatus
0 1 1 1 6 0 1 1 0 0 0 ... 0.050567 1 1 0 0 1 4 0 0 1
1 0 0 0 48 1 0 1 0 0 0 ... 0.313690 1 1 0 0 0 2 0 0 0
2 1 1 1 12 1 0 2 0 0 0 ... 0.101574 1 1 0 0 1 2 0 1 1
3 1 1 1 42 1 0 2 0 0 1 ... 0.419941 0 0 0 0 0 2 0 0 1
4 1 1 1 24 1 0 2 0 0 0 ... 0.254209 0 1 0 0 0 3 1 0 0

5 rows × 24 columns

2. Using correlation

In [17]:
corrMatrix = round(german_xai_imp.corr(),1)
corrMatrix
Out[17]:
Gender Age Marital_Status NumMonths Savings_<500 Savings_none Dependents Property_rent Job_management/self-emp/officer/highly qualif emp Debtors_guarantor ... CreditAmount Collateral_real estate Debtors_none Job_unemp/unskilled-non resident Purpose_others CreditHistory_other PayBackPercent Collateral_unknown/none Purpose_education CreditStatus
Gender 1.0 0.3 0.7 0.1 -0.0 0.0 0.2 -0.2 0.1 0.0 ... 0.1 -0.0 -0.0 -0.1 0.0 0.1 0.1 0.1 -0.1 0.1
Age 0.3 1.0 0.2 0.0 -0.1 0.0 0.2 -0.3 0.2 -0.0 ... 0.0 -0.0 0.0 -0.0 0.1 0.1 0.1 0.1 0.0 0.1
Marital_Status 0.7 0.2 1.0 0.1 -0.1 0.1 0.3 -0.2 0.1 -0.0 ... 0.2 -0.1 -0.0 -0.1 0.0 0.1 0.1 0.2 -0.0 0.1
NumMonths 0.1 0.0 0.1 1.0 -0.0 0.1 -0.0 -0.1 0.1 -0.0 ... 0.6 -0.2 0.0 -0.0 0.1 -0.1 0.1 0.2 0.0 -0.2
Savings_<500 -0.0 -0.1 -0.1 -0.0 1.0 -0.7 -0.0 0.0 0.0 0.1 ... -0.0 0.0 -0.1 -0.0 0.0 -0.0 -0.0 -0.0 -0.0 -0.2
Savings_none 0.0 0.0 0.1 0.1 -0.7 1.0 0.0 -0.0 0.0 -0.1 ... 0.1 -0.0 0.1 0.0 -0.0 0.0 0.0 0.0 0.0 0.1
Dependents 0.2 0.2 0.3 -0.0 -0.0 0.0 1.0 -0.1 -0.0 0.0 ... 0.0 0.0 -0.0 -0.0 0.0 0.0 -0.1 0.1 0.0 0.0
Property_rent -0.2 -0.3 -0.2 -0.1 0.0 -0.0 -0.1 1.0 -0.0 0.0 ... -0.0 0.0 -0.0 0.0 -0.1 -0.1 -0.1 -0.1 0.0 -0.1
Job_management/self-emp/officer/highly qualif emp 0.1 0.2 0.1 0.1 0.0 0.0 -0.0 -0.0 1.0 -0.1 ... 0.3 -0.2 0.0 -0.1 0.2 -0.0 0.0 0.2 -0.0 -0.0
Debtors_guarantor 0.0 -0.0 -0.0 -0.0 0.1 -0.1 0.0 0.0 -0.1 1.0 ... -0.1 0.2 -0.7 -0.0 0.0 -0.0 -0.0 -0.1 -0.1 0.1
Purpose_CarNew 0.0 0.1 0.0 -0.1 0.0 -0.0 0.1 -0.0 -0.0 -0.0 ... -0.0 0.0 0.0 0.1 -0.1 0.0 -0.0 0.0 -0.1 -0.1
Purpose_furniture/equip -0.1 -0.1 -0.1 -0.1 0.1 -0.1 -0.1 0.1 -0.0 -0.0 ... -0.0 -0.1 -0.0 -0.1 -0.1 -0.0 -0.1 -0.1 -0.1 -0.0
CreditHistory_none/paid -0.1 -0.2 -0.1 -0.0 0.0 -0.0 -0.0 0.1 -0.0 0.1 ... -0.0 -0.0 -0.1 0.0 -0.0 -0.8 -0.0 -0.0 -0.0 -0.2
Purpose_CarUsed 0.1 0.0 0.1 0.1 -0.1 0.1 0.1 0.0 0.2 -0.0 ... 0.3 -0.1 0.1 -0.0 -0.0 0.0 -0.1 0.1 -0.1 0.1
CreditAmount 0.1 0.0 0.2 0.6 -0.0 0.1 0.0 -0.0 0.3 -0.1 ... 1.0 -0.2 -0.0 -0.0 0.2 -0.0 -0.3 0.2 -0.0 -0.2
Collateral_real estate -0.0 -0.0 -0.1 -0.2 0.0 -0.0 0.0 0.0 -0.2 0.2 ... -0.2 1.0 -0.1 0.0 -0.0 0.0 -0.0 -0.3 -0.1 0.1
Debtors_none -0.0 0.0 -0.0 0.0 -0.1 0.1 -0.0 -0.0 0.0 -0.7 ... -0.0 -0.1 1.0 0.0 -0.1 0.0 0.0 0.0 0.1 0.0
Job_unemp/unskilled-non resident -0.1 -0.0 -0.1 -0.0 -0.0 0.0 -0.0 0.0 -0.1 -0.0 ... -0.0 0.0 0.0 1.0 0.0 0.0 -0.1 0.0 -0.0 -0.0
Purpose_others 0.0 0.1 0.0 0.1 0.0 -0.0 0.0 -0.1 0.2 0.0 ... 0.2 -0.0 -0.1 0.0 1.0 -0.0 -0.0 0.1 -0.0 -0.0
CreditHistory_other 0.1 0.1 0.1 -0.1 -0.0 0.0 0.0 -0.1 -0.0 -0.0 ... -0.0 0.0 0.0 0.0 -0.0 1.0 0.0 -0.0 0.0 0.2
PayBackPercent 0.1 0.1 0.1 0.1 -0.0 0.0 -0.1 -0.1 0.0 -0.0 ... -0.3 -0.0 0.0 -0.1 -0.0 0.0 1.0 0.0 0.0 -0.1
Collateral_unknown/none 0.1 0.1 0.2 0.2 -0.0 0.0 0.1 -0.1 0.2 -0.1 ... 0.2 -0.3 0.0 0.0 0.1 -0.0 0.0 1.0 0.2 -0.1
Purpose_education -0.1 0.0 -0.0 0.0 -0.0 0.0 0.0 0.0 -0.0 -0.1 ... -0.0 -0.1 0.1 -0.0 -0.0 0.0 0.0 0.2 1.0 -0.1
CreditStatus 0.1 0.1 0.1 -0.2 -0.2 0.1 0.0 -0.1 -0.0 0.1 ... -0.2 0.1 0.0 -0.0 -0.0 0.2 -0.1 -0.1 -0.1 1.0

24 rows × 24 columns

In [18]:
klib.corr_plot(german_xai_imp,annot=False)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x2837e91b5c8>
In [19]:
klib.corr_plot(german_xai_imp,target='CreditStatus')
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x2837e7ed108>

No higher correlation is observed between input variables (except gender, marital status (0.7) and credit amount, num of months (0.6) and between target variable and input variables. But since we are trying to understand the impact of protected variables let us retain them without dropping.

writing data to csv file

In [20]:
german_xai_imp.to_csv('C:/Users/krish/Downloads/German-reduced.csv', index=False)
In [21]:
#from sklearn.feature_selection import SelectPercentile
#selected_top=SelectPercentile(score_func=mutual_info_classif,  percentile=20)
#from sklearn.feature_selection import SelectKBest
#selected_top=SelectKBest(mutual_info_classif,k=10)
#selected_top.fit_transform(X_train,y_train)
In [22]:
#selected_top.fit_transform(X_train,y_train)
In [23]:
#X_sig=X_train.columns[selected_top.get_support()]
In [24]:
#X_sig
In [25]:
#X_train_sig=pd.DataFrame(X_train,columns=X_sig)
#X_test_sig=pd.DataFrame(X_test,columns=X_sig)
#X_train_sig.head()
#X_train_sig.shape
#X_test_sig.head()
#X_test_sig.shape

List of protected attributes

(https://arxiv.org/pdf/1811.11154.pdf)

In [26]:
from IPython.display  import Image
Image(filename='C:/Users/krish/Desktop/list of protected variables.png',width=500,height=30)
Out[26]:

From the above, we have 3 protected fields in our dataset:

1. Gender
2. Age
3. Marital Status

Now, let us identify previlege class in each protected attribute.

1.Gender

In [27]:
print(german_xai_imp['Gender'].value_counts())
german_xai_imp.groupby(['Gender'])['CreditStatus'].mean()
#https://arxiv.org/pdf/1810.01943.pdf, https://arxiv.org/pdf/2005.12379.pdf
1    690
0    310
Name: Gender, dtype: int64
Out[27]:
Gender
0    0.648387
1    0.723188
Name: CreditStatus, dtype: float64

Males(1) are more than females and for males(1) target variable CreditScore is more favorable having higher value for given number of males than female group average. Hence male(1) is privelieged class.

2.Age

In [28]:
print(german_xai_imp['Age'].value_counts())
german_xai_imp.groupby(['Age'])['CreditStatus'].mean()
1    810
0    190
Name: Age, dtype: int64
Out[28]:
Age
0    0.578947
1    0.728395
Name: CreditStatus, dtype: float64

Age >26: 1; else 0; so ppl above 26 are more and group average of ppl with age >26 is higher than the group of age < 26 ,so age(1) is priveleiged group

3. Marital Status

In [29]:
print(german_xai_imp['Marital_Status'].value_counts())
german_xai_imp.groupby(['Marital_Status'])['CreditStatus'].mean()
1    548
0    452
Name: Marital_Status, dtype: int64
Out[29]:
Marital_Status
0    0.659292
1    0.733577
Name: CreditStatus, dtype: float64

Singles(1) are more than not singles and for singles(1) target variable CreditScore is more favorable having higher value for given number of singles than non singles group average. Hence singles(1) is privelieged group

Converting Dataframe to aif compatible format

BinaryLabelDataset: Base class for all structured datasets with binary labels.

In [30]:
# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.explainers import MetricTextExplainer
from aif360.metrics import ClassificationMetric
# Get DF into IBM format
from aif360 import datasets
aif_train_dataset = datasets.BinaryLabelDataset(favorable_label = 1, unfavorable_label = 0, df=german_xai_imp,
                                                      label_names=["CreditStatus"],
                                                     protected_attribute_names=["Age","Gender","Marital_Status"],
                                              privileged_protected_attributes = [1,1,1])
dataset_orig_train, dataset_orig_test = aif_train_dataset.split([0.7], shuffle=True)
In [31]:
dataset_orig_train.feature_names
Out[31]:
['Gender',
 'Age',
 'Marital_Status',
 'NumMonths',
 'Savings_<500',
 'Savings_none',
 'Dependents',
 'Property_rent',
 'Job_management/self-emp/officer/highly qualif emp',
 'Debtors_guarantor',
 'Purpose_CarNew',
 'Purpose_furniture/equip',
 'CreditHistory_none/paid',
 'Purpose_CarUsed',
 'CreditAmount',
 'Collateral_real estate',
 'Debtors_none',
 'Job_unemp/unskilled-non resident',
 'Purpose_others',
 'CreditHistory_other',
 'PayBackPercent',
 'Collateral_unknown/none',
 'Purpose_education']

Measuring fairness:

Disparate Impact

a) With respect to Gender

In [32]:
# Disparate impact measurement for gender
metric_aif_train_ready_gender = BinaryLabelDatasetMetric(
        aif_train_dataset,
        unprivileged_groups=[{"Age":0,"Gender":0,"Marital_Status":0}],  privileged_groups=[{"Age":1,"Gender":1,"Marital_Status":1}]) 
explainer_aif_train_ready_gender = MetricTextExplainer(metric_aif_train_ready_gender)

print(explainer_aif_train_ready_gender.disparate_impact())
print("Difference in mean outcomes between unprivileged and privileged groups of gender = %f" % metric_aif_train_ready_gender.mean_difference())
Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.7385093167701864
Difference in mean outcomes between unprivileged and privileged groups of gender = -0.195587

Handling bias: Reweighing

In [33]:
from aif360.algorithms.preprocessing import Reweighing
privileged_groups = [{'Gender': 1}]
unprivileged_groups = [{'Gender': 0}]
RW_gender = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
dataset_aif_tranf_gender = RW_gender.fit_transform(dataset_orig_train)
metric_transf_train_gender = BinaryLabelDatasetMetric(dataset_aif_tranf_gender, 
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)

print("Difference in mean outcomes between unprivileged and privileged groups of gender = %f" % metric_transf_train_gender.mean_difference())
WARNING:root:No module named 'numba.decorators': LFR will be unavailable. To install, run:
pip install 'aif360[LFR]'
Difference in mean outcomes between unprivileged and privileged groups of gender = 0.000000

b) with respect to Age

In [34]:
# Disparate impact measurement for age
metric_aif_train_ready_age = BinaryLabelDatasetMetric(
        aif_train_dataset,
        unprivileged_groups=[{"Age":0}],
        privileged_groups=[{"Age":1}])
explainer_aif_train_ready_age = MetricTextExplainer(metric_aif_train_ready_age)

print(explainer_aif_train_ready_age.disparate_impact())
print("Difference in mean outcomes between unprivileged and privileged groups of age = %f" % metric_aif_train_ready_age.mean_difference())
Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.7948260481712757
Difference in mean outcomes between unprivileged and privileged groups of age = -0.149448

Handling bias: Reweighing

In [35]:
from aif360.algorithms.preprocessing import Reweighing
privileged_groups = [{'Age': 1}]
unprivileged_groups = [{'Age': 0}]
RW_age = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
dataset_aif_tranf_age = RW_age.fit_transform(dataset_orig_train)
metric_transf_train_age = BinaryLabelDatasetMetric(dataset_aif_tranf_age, 
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)

print("Difference in mean outcomes between unprivileged and privileged groups of age = %f" % metric_transf_train_age.mean_difference())
Difference in mean outcomes between unprivileged and privileged groups of age = -0.000000
In [36]:
metric_transf_train_age
Out[36]:
<aif360.metrics.binary_label_dataset_metric.BinaryLabelDatasetMetric at 0x2837ed139c8>

c) with respect to Marital Status

In [37]:
# Disparate impact measurement for age
metric_aif_train_ready_marital = BinaryLabelDatasetMetric(
        aif_train_dataset,
        unprivileged_groups=[{"Marital_Status":0}],
        privileged_groups=[{"Marital_Status":1}])
explainer_aif_train_ready_marital = MetricTextExplainer(metric_aif_train_ready_marital)

print(explainer_aif_train_ready_marital.disparate_impact())
print("Difference in mean outcomes between unprivileged and privileged groups of marital status = %f" % metric_aif_train_ready_marital.mean_difference())
Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8987364064632589
Difference in mean outcomes between unprivileged and privileged groups of marital status = -0.074285

Handling bias: Reweighing

In [38]:
from aif360.algorithms.preprocessing import Reweighing
privileged_groups = [{'Marital_Status': 1}]
unprivileged_groups = [{'Marital_Status': 0}]
RW_Marital = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
dataset_aif_tranf_marital = RW_Marital.fit_transform(dataset_orig_train)
metric_transf_train_marital = BinaryLabelDatasetMetric(dataset_aif_tranf_marital, 
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)

print("Difference in mean outcomes between unprivileged and privileged groups of marital status = %f" % metric_transf_train_marital.mean_difference())
Difference in mean outcomes between unprivileged and privileged groups of marital status = -0.000000

Building a ML model

1.RANDOM FOREST

In [39]:
#Seting the Hyper Parameters
param_grid = {"max_depth": [3,5,7,None],
              "n_estimators":[3,5,10,15,20],
              "max_features": [4,7,15]}
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
#Creating the classifier
rf_model = RandomForestClassifier(random_state=40)
grid_search = GridSearchCV(rf_model, param_grid=param_grid, cv=5, scoring='recall', verbose=0)
model = grid_search

1.a) with age as protected variable in the dataset

In [40]:
mdl_age = model.fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
In [41]:
rf_shap_values = shap.KernelExplainer(grid_search.predict,dataset_aif_tranf_age.features)
WARNING:shap:Using 700 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.

1.a.1) Model feature importance

In [42]:
importances = model.best_estimator_.feature_importances_
indices = np.argsort(importances)
features = dataset_aif_tranf_age.feature_names
#https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv
In [43]:
importances

plt.figure(figsize=(20,30))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Out[43]:
array([0.03001171, 0.08958683, 0.        , 0.20791188, 0.12146826,
       0.01620622, 0.00799866, 0.04957083, 0.01319578, 0.02144778,
       0.06537335, 0.00328254, 0.09877265, 0.02824589, 0.09275162,
       0.01563921, 0.00568471, 0.        , 0.        , 0.07887251,
       0.0289633 , 0.02128787, 0.00372841])
Out[43]:
<Figure size 1440x2160 with 0 Axes>
Out[43]:
Text(0.5, 1.0, 'Feature Importances')
Out[43]:
<BarContainer object of 23 artists>
Out[43]:
([<matplotlib.axis.YTick at 0x2837f09ae08>,
  <matplotlib.axis.YTick at 0x2837f0abbc8>,
  <matplotlib.axis.YTick at 0x2837f0abe08>,
  <matplotlib.axis.YTick at 0x2837f0f4348>,
  <matplotlib.axis.YTick at 0x2837f0f3188>,
  <matplotlib.axis.YTick at 0x2837f0f3908>,
  <matplotlib.axis.YTick at 0x2837f0fa048>,
  <matplotlib.axis.YTick at 0x2837f0fab08>,
  <matplotlib.axis.YTick at 0x2837f0ff408>,
  <matplotlib.axis.YTick at 0x2837f0ffcc8>,
  <matplotlib.axis.YTick at 0x2837f103648>,
  <matplotlib.axis.YTick at 0x2837f103748>,
  <matplotlib.axis.YTick at 0x2837f0f37c8>,
  <matplotlib.axis.YTick at 0x2837f1073c8>,
  <matplotlib.axis.YTick at 0x2837f107a88>,
  <matplotlib.axis.YTick at 0x2837ed59608>,
  <matplotlib.axis.YTick at 0x2837f0a0188>,
  <matplotlib.axis.YTick at 0x2837f0a0bc8>,
  <matplotlib.axis.YTick at 0x2837f10a6c8>,
  <matplotlib.axis.YTick at 0x2837f10d248>,
  <matplotlib.axis.YTick at 0x2837f10dc88>,
  <matplotlib.axis.YTick at 0x2837f10ad88>,
  <matplotlib.axis.YTick at 0x2837f111488>],
 <a list of 23 Text yticklabel objects>)
Out[43]:
Text(0.5, 0, 'Relative Importance')

Features that are important in the model are given above.

1.a.2) Model Explainability

1.a.2.a) Using SHAP

In [44]:
mdl_age.best_params_
type(model)
explainer = shap.TreeExplainer(grid_search.best_estimator_)
shap_values_a=explainer.shap_values(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
#https://github.com/slundberg/shap/issues/968
Out[44]:
{'max_depth': 3, 'max_features': 7, 'n_estimators': 15}
Out[44]:
sklearn.model_selection._search.GridSearchCV
In [45]:
shap_values_a
Out[45]:
[array([[-7.30389951e-03, -1.00577214e-02,  0.00000000e+00, ...,
          3.45390611e-03, -4.39114004e-03, -2.24420701e-04],
        [-8.80738629e-03, -1.08102757e-02,  0.00000000e+00, ...,
         -2.01386014e-02, -2.94004368e-03, -5.51378446e-05],
        [-6.56476908e-03, -9.64806980e-03,  0.00000000e+00, ...,
          9.38034498e-04, -2.94004368e-03, -5.51378446e-05],
        ...,
        [ 1.94415910e-02,  3.28094137e-02,  0.00000000e+00, ...,
          3.40485610e-03, -1.42056009e-03,  2.54047619e-03],
        [-9.56934393e-03, -1.04126901e-02,  0.00000000e+00, ...,
          3.85941459e-03, -3.56628377e-03, -1.33709273e-04],
        [ 7.89535165e-03, -1.18534551e-02,  0.00000000e+00, ...,
         -1.11277340e-02,  1.07315468e-02, -5.51378446e-05]]),
 array([[ 7.30389951e-03,  1.00577214e-02,  0.00000000e+00, ...,
         -3.45390611e-03,  4.39114004e-03,  2.24420701e-04],
        [ 8.80738629e-03,  1.08102757e-02,  0.00000000e+00, ...,
          2.01386014e-02,  2.94004368e-03,  5.51378446e-05],
        [ 6.56476908e-03,  9.64806980e-03,  0.00000000e+00, ...,
         -9.38034498e-04,  2.94004368e-03,  5.51378446e-05],
        ...,
        [-1.94415910e-02, -3.28094137e-02,  0.00000000e+00, ...,
         -3.40485610e-03,  1.42056009e-03, -2.54047619e-03],
        [ 9.56934393e-03,  1.04126901e-02,  0.00000000e+00, ...,
         -3.85941459e-03,  3.56628377e-03,  1.33709273e-04],
        [-7.89535165e-03,  1.18534551e-02,  0.00000000e+00, ...,
          1.11277340e-02, -1.07315468e-02,  5.51378446e-05]])]

The shap_values[0] are explanations with respect to the negative class, while shap_values[1] are explanations with respect to the positive class.

Features in blue pushes the base value towards lowest values and features in red moves base levels towards higher values.

In [46]:
shap.initjs()
shap.force_plot(explainer.expected_value[0],shap_values_a[0][0], dataset_aif_tranf_age.feature_names)
#https://github.com/slundberg/shap
#https://github.com/slundberg/shap/issues/279
Out[46]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [47]:
shap.initjs()
shap.force_plot(explainer.expected_value[1],shap_values_a[1][0], dataset_aif_tranf_age.feature_names)
Out[47]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [48]:
dataset_aif_tranf_age.feature_names
Out[48]:
['Gender',
 'Age',
 'Marital_Status',
 'NumMonths',
 'Savings_<500',
 'Savings_none',
 'Dependents',
 'Property_rent',
 'Job_management/self-emp/officer/highly qualif emp',
 'Debtors_guarantor',
 'Purpose_CarNew',
 'Purpose_furniture/equip',
 'CreditHistory_none/paid',
 'Purpose_CarUsed',
 'CreditAmount',
 'Collateral_real estate',
 'Debtors_none',
 'Job_unemp/unskilled-non resident',
 'Purpose_others',
 'CreditHistory_other',
 'PayBackPercent',
 'Collateral_unknown/none',
 'Purpose_education']
In [49]:
shap.force_plot(explainer.expected_value[0],
                shap_values_a[0][:,:], dataset_aif_tranf_age.features[:,:],feature_names = dataset_aif_tranf_age.feature_names)
Out[49]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [50]:
shap.force_plot(explainer.expected_value[1],
                shap_values_a[1][:,:], dataset_aif_tranf_age.features[:,:],feature_names = dataset_aif_tranf_age.feature_names)
Out[50]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [51]:
p = shap.summary_plot(shap_values_a, dataset_aif_tranf_age.features, feature_names=dataset_aif_tranf_age.feature_names) 
display(p)
None

Variables with higher impact are Age,CreditAmount,NumMonths,Savings etc

In [52]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0], shap_values_a[0][0],feature_names=dataset_aif_tranf_age.feature_names)

Interpretation of graph: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html

The above explanation shows features each contributing to push the model output from the base value (the average model output over the training dataset we passed) to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue.

f(x)- model output impacted by features; E(f(x))- expected output.

One the fundemental properties of Shapley values is that they always sum up to the difference between the game outcome when all players are present and the game outcome when no players are present. For machine learning models this means that SHAP values of all the input features will always sum up to the difference between baseline (expected) model output and the current model output for the prediction being explained.

In [53]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], shap_values_a[1][0],feature_names=dataset_aif_tranf_age.feature_names)

1.a.2.b) Using eli5

In [54]:
#!pip install eli5
from eli5.sklearn import PermutationImportance
In [55]:
perm_age = PermutationImportance(model).fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
In [56]:
perm_imp_1=eli5.show_weights(perm_age,feature_names = dataset_aif_tranf_age.feature_names)
perm_imp_1
plt.show()
Out[56]:
Weight Feature
0.0041 ± 0.0026 Savings_<500
0.0041 ± 0.0052 PayBackPercent
0.0037 ± 0.0054 NumMonths
0.0033 ± 0.0042 Age
0.0024 ± 0.0031 Collateral_real estate
0.0016 ± 0.0031 Gender
0.0016 ± 0.0016 Purpose_CarNew
0.0016 ± 0.0016 CreditHistory_none/paid
0.0016 ± 0.0031 CreditAmount
0.0012 ± 0.0020 Purpose_furniture/equip
0.0008 ± 0.0033 Property_rent
0.0008 ± 0.0020 CreditHistory_other
0.0008 ± 0.0020 Collateral_unknown/none
0 ± 0.0000 Dependents
0 ± 0.0000 Savings_none
0 ± 0.0000 Purpose_education
0 ± 0.0000 Purpose_CarUsed
0 ± 0.0000 Debtors_guarantor
0 ± 0.0000 Marital_Status
0 ± 0.0000 Debtors_none
… 3 more …

eli5 provides a way to compute feature importances for any black-box estimator by measuring how score decreases when a feature is not available; the method is also known as “permutation importance” or “Mean Decrease Accuracy (MDA)”.

The first number in each row shows how much model performance decreased with a random shuffling (in this case, using "accuracy" as the performance metric).

Like most things in data science, there is some randomness to the exact performance change from a shuffling a column. We measure the amount of randomness in our permutation importance calculation by repeating the process with multiple shuffles. The number after the ± measures how performance varied from one-reshuffling to the next.

You'll occasionally see negative values for permutation importances. In those cases, the predictions on the shuffled (or noisy) data happened to be more accurate than the real data. This happens when the feature didn't matter (should have had an importance close to 0), but random chance caused the predictions on shuffled data to be more accurate. This is more common with small datasets, like the one in this example, because there is more room for luck/chance.

https://www.kaggle.com/dansbecker/permutation-importance

1.b) with gender as protected variable in the dataset

In [57]:
mdl_gender = model.fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
In [58]:
rf_shap_values = shap.KernelExplainer(grid_search.predict,dataset_aif_tranf_gender.features)
WARNING:shap:Using 700 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.

1.b.1) Model feature importance

In [59]:
importances = model.best_estimator_.feature_importances_
indices = np.argsort(importances)
features = dataset_aif_tranf_gender.feature_names
#https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv
In [60]:
importances

plt.figure(figsize=(20,30))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Out[60]:
array([0.03001171, 0.08958683, 0.        , 0.20791188, 0.12146826,
       0.01620622, 0.00799866, 0.04957083, 0.01319578, 0.02144778,
       0.06537335, 0.00328254, 0.09877265, 0.02824589, 0.09275162,
       0.01563921, 0.00568471, 0.        , 0.        , 0.07887251,
       0.0289633 , 0.02128787, 0.00372841])
Out[60]:
<Figure size 1440x2160 with 0 Axes>
Out[60]:
Text(0.5, 1.0, 'Feature Importances')
Out[60]:
<BarContainer object of 23 artists>
Out[60]:
([<matplotlib.axis.YTick at 0x2837ed27248>,
  <matplotlib.axis.YTick at 0x28301073bc8>,
  <matplotlib.axis.YTick at 0x283011fe448>,
  <matplotlib.axis.YTick at 0x28301051d48>,
  <matplotlib.axis.YTick at 0x28301078fc8>,
  <matplotlib.axis.YTick at 0x28301078f88>,
  <matplotlib.axis.YTick at 0x2830106b448>,
  <matplotlib.axis.YTick at 0x2830106bbc8>,
  <matplotlib.axis.YTick at 0x283010789c8>,
  <matplotlib.axis.YTick at 0x2830107b688>,
  <matplotlib.axis.YTick at 0x2830107ba48>,
  <matplotlib.axis.YTick at 0x28301080708>,
  <matplotlib.axis.YTick at 0x283005e1648>,
  <matplotlib.axis.YTick at 0x28301037048>,
  <matplotlib.axis.YTick at 0x28301055a08>,
  <matplotlib.axis.YTick at 0x28301058948>,
  <matplotlib.axis.YTick at 0x2830105c548>,
  <matplotlib.axis.YTick at 0x28301201388>,
  <matplotlib.axis.YTick at 0x28301058f88>,
  <matplotlib.axis.YTick at 0x28301080a48>,
  <matplotlib.axis.YTick at 0x283011f8ac8>,
  <matplotlib.axis.YTick at 0x2830124e448>,
  <matplotlib.axis.YTick at 0x28301251088>],
 <a list of 23 Text yticklabel objects>)
Out[60]:
Text(0.5, 0, 'Relative Importance')

1.b.2) Model Explainability

1.b.2.a) Using SHAP

In [61]:
mdl_gender.best_params_
type(model)
explainer = shap.TreeExplainer(grid_search.best_estimator_)
shap_values_b=explainer.shap_values(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
#https://github.com/slundberg/shap/issues/968
Out[61]:
{'max_depth': 3, 'max_features': 7, 'n_estimators': 15}
Out[61]:
sklearn.model_selection._search.GridSearchCV
In [62]:
shap_values_b
Out[62]:
[array([[-7.30389951e-03, -1.00577214e-02,  0.00000000e+00, ...,
          3.45390611e-03, -4.39114004e-03, -2.24420701e-04],
        [-8.80738629e-03, -1.08102757e-02,  0.00000000e+00, ...,
         -2.01386014e-02, -2.94004368e-03, -5.51378446e-05],
        [-6.56476908e-03, -9.64806980e-03,  0.00000000e+00, ...,
          9.38034498e-04, -2.94004368e-03, -5.51378446e-05],
        ...,
        [ 1.94415910e-02,  3.28094137e-02,  0.00000000e+00, ...,
          3.40485610e-03, -1.42056009e-03,  2.54047619e-03],
        [-9.56934393e-03, -1.04126901e-02,  0.00000000e+00, ...,
          3.85941459e-03, -3.56628377e-03, -1.33709273e-04],
        [ 7.89535165e-03, -1.18534551e-02,  0.00000000e+00, ...,
         -1.11277340e-02,  1.07315468e-02, -5.51378446e-05]]),
 array([[ 7.30389951e-03,  1.00577214e-02,  0.00000000e+00, ...,
         -3.45390611e-03,  4.39114004e-03,  2.24420701e-04],
        [ 8.80738629e-03,  1.08102757e-02,  0.00000000e+00, ...,
          2.01386014e-02,  2.94004368e-03,  5.51378446e-05],
        [ 6.56476908e-03,  9.64806980e-03,  0.00000000e+00, ...,
         -9.38034498e-04,  2.94004368e-03,  5.51378446e-05],
        ...,
        [-1.94415910e-02, -3.28094137e-02,  0.00000000e+00, ...,
         -3.40485610e-03,  1.42056009e-03, -2.54047619e-03],
        [ 9.56934393e-03,  1.04126901e-02,  0.00000000e+00, ...,
         -3.85941459e-03,  3.56628377e-03,  1.33709273e-04],
        [-7.89535165e-03,  1.18534551e-02,  0.00000000e+00, ...,
          1.11277340e-02, -1.07315468e-02,  5.51378446e-05]])]
In [63]:
shap.initjs()
shap.force_plot(explainer.expected_value[0],shap_values_b[0][0], dataset_aif_tranf_gender.feature_names)
#https://github.com/slundberg/shap
#https://github.com/slundberg/shap/issues/279
Out[63]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

The shap_values[0] are explanations with respect to the negative class, while shap_values[1] are explanations with respect to the positive class.

In [64]:
shap.initjs()
shap.force_plot(explainer.expected_value[1],shap_values_b[1][0], dataset_aif_tranf_gender.feature_names)
Out[64]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [65]:
dataset_aif_tranf_gender.feature_names
Out[65]:
['Gender',
 'Age',
 'Marital_Status',
 'NumMonths',
 'Savings_<500',
 'Savings_none',
 'Dependents',
 'Property_rent',
 'Job_management/self-emp/officer/highly qualif emp',
 'Debtors_guarantor',
 'Purpose_CarNew',
 'Purpose_furniture/equip',
 'CreditHistory_none/paid',
 'Purpose_CarUsed',
 'CreditAmount',
 'Collateral_real estate',
 'Debtors_none',
 'Job_unemp/unskilled-non resident',
 'Purpose_others',
 'CreditHistory_other',
 'PayBackPercent',
 'Collateral_unknown/none',
 'Purpose_education']
In [66]:
shap.force_plot(explainer.expected_value[0],
                shap_values_b[0][:,:], dataset_aif_tranf_gender.features[:,:],feature_names = dataset_aif_tranf_gender.feature_names)
Out[66]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [67]:
shap.force_plot(explainer.expected_value[1],
                shap_values_b[1][:,:], dataset_aif_tranf_gender.features[:,:],feature_names = dataset_aif_tranf_gender.feature_names)
Out[67]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [68]:
p = shap.summary_plot(shap_values_b, dataset_aif_tranf_gender.features, feature_names=dataset_aif_tranf_gender.feature_names) 
display(p)
None
In [69]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0], shap_values_b[0][0],feature_names=dataset_aif_tranf_gender.feature_names)

Interpretation of graph: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html

f(x)- model output impacted by features; E(f(x))- expected output.

The above explanation shows features each contributing to push the model output from the base value (the average model output over the training dataset we passed) to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue.

One the fundemental properties of Shapley values is that they always sum up to the difference between the game outcome when all players are present and the game outcome when no players are present. For machine learning models this means that SHAP values of all the input features will always sum up to the difference between baseline (expected) model output and the current model output for the prediction being explained.

In [70]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], shap_values_b[1][0],feature_names=dataset_aif_tranf_gender.feature_names)

1.b.2.b) Using ELI5

In [71]:
#!pip install eli5
from eli5.sklearn import PermutationImportance
In [72]:
perm_gender = PermutationImportance(model).fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
In [73]:
perm_imp_2=eli5.show_weights(perm_gender,feature_names = dataset_aif_tranf_gender.feature_names)
perm_imp_2
plt.show()
Out[73]:
Weight Feature
0.0053 ± 0.0033 NumMonths
0.0037 ± 0.0031 Savings_<500
0.0024 ± 0.0016 PayBackPercent
0.0024 ± 0.0040 Age
0.0020 ± 0.0000 CreditHistory_none/paid
0.0016 ± 0.0031 Collateral_real estate
0.0016 ± 0.0031 CreditAmount
0.0012 ± 0.0020 Gender
0.0012 ± 0.0020 Purpose_CarNew
0.0008 ± 0.0020 Collateral_unknown/none
0.0008 ± 0.0020 Purpose_furniture/equip
0.0004 ± 0.0016 Property_rent
0 ± 0.0000 Dependents
0 ± 0.0000 CreditHistory_other
0 ± 0.0000 Savings_none
0 ± 0.0000 Purpose_education
0 ± 0.0000 Purpose_CarUsed
0 ± 0.0000 Debtors_guarantor
0 ± 0.0000 Marital_Status
0 ± 0.0000 Debtors_none
… 3 more …

eli5 provides a way to compute feature importances for any black-box estimator by measuring how score decreases when a feature is not available; the method is also known as “permutation importance” or “Mean Decrease Accuracy (MDA)”.

1.c) with marital status as protected variable in the dataset

In [74]:
mdl_marital = model.fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
In [75]:
rf_shap_values = shap.KernelExplainer(grid_search.predict,dataset_aif_tranf_marital.features)
WARNING:shap:Using 700 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.

1.c.1) Model feature importance

In [76]:
importances = model.best_estimator_.feature_importances_
indices = np.argsort(importances)
features = dataset_aif_tranf_marital.feature_names
#https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv
In [77]:
importances

plt.figure(figsize=(20,30))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Out[77]:
array([0.03001171, 0.08958683, 0.        , 0.20791188, 0.12146826,
       0.01620622, 0.00799866, 0.04957083, 0.01319578, 0.02144778,
       0.06537335, 0.00328254, 0.09877265, 0.02824589, 0.09275162,
       0.01563921, 0.00568471, 0.        , 0.        , 0.07887251,
       0.0289633 , 0.02128787, 0.00372841])
Out[77]:
<Figure size 1440x2160 with 0 Axes>
Out[77]:
Text(0.5, 1.0, 'Feature Importances')
Out[77]:
<BarContainer object of 23 artists>
Out[77]:
([<matplotlib.axis.YTick at 0x28304003f88>,
  <matplotlib.axis.YTick at 0x283012b6948>,
  <matplotlib.axis.YTick at 0x28304154ac8>,
  <matplotlib.axis.YTick at 0x28303fb7348>,
  <matplotlib.axis.YTick at 0x28303fcfdc8>,
  <matplotlib.axis.YTick at 0x28303fd5548>,
  <matplotlib.axis.YTick at 0x28303fd5908>,
  <matplotlib.axis.YTick at 0x28303fd5a48>,
  <matplotlib.axis.YTick at 0x28303fd7548>,
  <matplotlib.axis.YTick at 0x28303fdfc48>,
  <matplotlib.axis.YTick at 0x28303fdf8c8>,
  <matplotlib.axis.YTick at 0x28303fc3bc8>,
  <matplotlib.axis.YTick at 0x28303fc7dc8>,
  <matplotlib.axis.YTick at 0x28303fe9648>,
  <matplotlib.axis.YTick at 0x28303fb0c88>,
  <matplotlib.axis.YTick at 0x28303fbe348>,
  <matplotlib.axis.YTick at 0x28303fefc08>,
  <matplotlib.axis.YTick at 0x28303fbe688>,
  <matplotlib.axis.YTick at 0x28303fd7b88>,
  <matplotlib.axis.YTick at 0x28303fedb48>,
  <matplotlib.axis.YTick at 0x28303fedbc8>,
  <matplotlib.axis.YTick at 0x28303fab9c8>,
  <matplotlib.axis.YTick at 0x28303fafe88>],
 <a list of 23 Text yticklabel objects>)
Out[77]:
Text(0.5, 0, 'Relative Importance')

1.c.2) Model Explainability

1.c.2.a) Using SHAP

In [78]:
mdl_gender.best_params_
type(model)
explainer = shap.TreeExplainer(grid_search.best_estimator_)
shap_values_c=explainer.shap_values(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
#https://github.com/slundberg/shap/issues/968
Out[78]:
{'max_depth': 3, 'max_features': 7, 'n_estimators': 15}
Out[78]:
sklearn.model_selection._search.GridSearchCV
In [79]:
shap_values_c
Out[79]:
[array([[-7.30389951e-03, -1.00577214e-02,  0.00000000e+00, ...,
          3.45390611e-03, -4.39114004e-03, -2.24420701e-04],
        [-8.80738629e-03, -1.08102757e-02,  0.00000000e+00, ...,
         -2.01386014e-02, -2.94004368e-03, -5.51378446e-05],
        [-6.56476908e-03, -9.64806980e-03,  0.00000000e+00, ...,
          9.38034498e-04, -2.94004368e-03, -5.51378446e-05],
        ...,
        [ 1.94415910e-02,  3.28094137e-02,  0.00000000e+00, ...,
          3.40485610e-03, -1.42056009e-03,  2.54047619e-03],
        [-9.56934393e-03, -1.04126901e-02,  0.00000000e+00, ...,
          3.85941459e-03, -3.56628377e-03, -1.33709273e-04],
        [ 7.89535165e-03, -1.18534551e-02,  0.00000000e+00, ...,
         -1.11277340e-02,  1.07315468e-02, -5.51378446e-05]]),
 array([[ 7.30389951e-03,  1.00577214e-02,  0.00000000e+00, ...,
         -3.45390611e-03,  4.39114004e-03,  2.24420701e-04],
        [ 8.80738629e-03,  1.08102757e-02,  0.00000000e+00, ...,
          2.01386014e-02,  2.94004368e-03,  5.51378446e-05],
        [ 6.56476908e-03,  9.64806980e-03,  0.00000000e+00, ...,
         -9.38034498e-04,  2.94004368e-03,  5.51378446e-05],
        ...,
        [-1.94415910e-02, -3.28094137e-02,  0.00000000e+00, ...,
         -3.40485610e-03,  1.42056009e-03, -2.54047619e-03],
        [ 9.56934393e-03,  1.04126901e-02,  0.00000000e+00, ...,
         -3.85941459e-03,  3.56628377e-03,  1.33709273e-04],
        [-7.89535165e-03,  1.18534551e-02,  0.00000000e+00, ...,
          1.11277340e-02, -1.07315468e-02,  5.51378446e-05]])]
In [80]:
shap.initjs()
shap.force_plot(explainer.expected_value[0],shap_values_c[0][0], dataset_aif_tranf_marital.feature_names)
#https://github.com/slundberg/shap
#https://github.com/slundberg/shap/issues/279
Out[80]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

The shap_values[0] are explanations with respect to the negative class, while shap_values[1] are explanations with respect to the positive class.

In [81]:
shap.initjs()
shap.force_plot(explainer.expected_value[1],shap_values_c[1][0], dataset_aif_tranf_marital.feature_names)
Out[81]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [82]:
dataset_aif_tranf_marital.feature_names
Out[82]:
['Gender',
 'Age',
 'Marital_Status',
 'NumMonths',
 'Savings_<500',
 'Savings_none',
 'Dependents',
 'Property_rent',
 'Job_management/self-emp/officer/highly qualif emp',
 'Debtors_guarantor',
 'Purpose_CarNew',
 'Purpose_furniture/equip',
 'CreditHistory_none/paid',
 'Purpose_CarUsed',
 'CreditAmount',
 'Collateral_real estate',
 'Debtors_none',
 'Job_unemp/unskilled-non resident',
 'Purpose_others',
 'CreditHistory_other',
 'PayBackPercent',
 'Collateral_unknown/none',
 'Purpose_education']
In [83]:
shap.force_plot(explainer.expected_value[0],
                shap_values_c[0][:,:], dataset_aif_tranf_marital.features[:,:],feature_names = dataset_aif_tranf_marital.feature_names)
Out[83]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [84]:
shap.force_plot(explainer.expected_value[1],
                shap_values_c[1][:,:], dataset_aif_tranf_marital.features[:,:],feature_names = dataset_aif_tranf_marital.feature_names)
Out[84]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [85]:
p = shap.summary_plot(shap_values_c, dataset_aif_tranf_marital.features, feature_names=dataset_aif_tranf_marital.feature_names) 
display(p)
None
In [86]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0], shap_values_c[0][0],feature_names=dataset_aif_tranf_marital.feature_names)

Interpretation of graph: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html

f(x)- model output impacted by features; E(f(x))- expected output.

The above explanation shows features each contributing to push the model output from the base value (the average model output over the training dataset we passed) to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue.

One the fundemental properties of Shapley values is that they always sum up to the difference between the game outcome when all players are present and the game outcome when no players are present. For machine learning models this means that SHAP values of all the input features will always sum up to the difference between baseline (expected) model output and the current model output for the prediction being explained.

In [87]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], shap_values_c[1][0],feature_names=dataset_aif_tranf_marital.feature_names)

1.c.2.a) Using ELI5

In [88]:
#!pip install eli5
from eli5.sklearn import PermutationImportance
In [89]:
perm_marital = PermutationImportance(model).fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
In [90]:
perm_imp_3=eli5.show_weights(perm_marital,feature_names = dataset_aif_tranf_marital.feature_names)
perm_imp_3
plt.show()
Out[90]:
Weight Feature
0.0049 ± 0.0033 Savings_<500
0.0049 ± 0.0033 NumMonths
0.0045 ± 0.0048 Age
0.0037 ± 0.0031 PayBackPercent
0.0024 ± 0.0016 Collateral_real estate
0.0020 ± 0.0026 Gender
0.0016 ± 0.0031 Purpose_furniture/equip
0.0012 ± 0.0020 CreditHistory_none/paid
0.0008 ± 0.0020 CreditAmount
0.0008 ± 0.0020 Property_rent
0.0004 ± 0.0016 Collateral_unknown/none
0.0004 ± 0.0016 CreditHistory_other
0.0004 ± 0.0016 Job_management/self-emp/officer/highly qualif emp
0.0004 ± 0.0016 Purpose_CarNew
0 ± 0.0000 Marital_Status
0 ± 0.0000 Savings_none
0 ± 0.0000 Dependents
0 ± 0.0000 Debtors_none
0 ± 0.0000 Debtors_guarantor
0 ± 0.0000 Job_unemp/unskilled-non resident
… 3 more …

eli5 provides a way to compute feature importances for any black-box estimator by measuring how score decreases when a feature is not available; the method is also known as “permutation importance” or “Mean Decrease Accuracy (MDA)”.

There are several different ways to calculate feature importances. By default, “gain” is used, that is the average gain of the feature when it is used in trees.

2. XGBOOST

In [91]:
from xgboost import XGBClassifier
estimator = XGBClassifier(seed=40)

parameters = {
    'max_depth': range (2, 10, 2),
    'n_estimators': range(60, 240, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'recall',
    
    cv = 5,
    verbose=0
)

model=grid_search
In [92]:
#rf_shap_values = shap.KernelExplainer(grid_search.predict,dataset_aif_tranf_age.features)

2.a) with age as protected variable

In [93]:
mdl_age = model.fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())

2.a.1) Model Explainability

2.a.1) Using SHAP

In [94]:
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.TreeExplainer(grid_search.best_estimator_,dataset_aif_tranf_age.features)
shap_values=explainer.shap_values(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
#https://github.com/slundberg/shap
In [95]:
shap_values
Out[95]:
array([[ 0.02358738,  0.06138272,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       [ 0.00327603,  0.08784266,  0.        , ...,  0.        ,
         0.        ,  0.00081661],
       [ 0.00327603,  0.04038655,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       ...,
       [-0.01179369, -0.13975128,  0.        , ...,  0.        ,
         0.        , -0.00840509],
       [ 0.00327603,  0.04038655,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       [-0.01179369,  0.12233785,  0.        , ...,  0.        ,
         0.        ,  0.00081661]])
In [96]:
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[0,:], dataset_aif_tranf_age.feature_names)
Out[96]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [97]:
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[1,:], dataset_aif_tranf_age.feature_names)
Out[97]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [98]:
shap.force_plot(explainer.expected_value, shap_values[:,:], X.iloc[:,:],feature_names = dataset_aif_tranf_age.feature_names)
Out[98]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [99]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[0,:],feature_names=dataset_aif_tranf_age.feature_names)

2.a.2) Using ELI5

In [100]:
perm_age = PermutationImportance(model).fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
perm_imp=eli5.show_weights(perm_age,feature_names = dataset_aif_tranf_age.feature_names)
perm_imp
plt.show()
Out[100]:
Weight Feature
0.0122 ± 0.0115 Age
0.0114 ± 0.0049 NumMonths
0.0045 ± 0.0040 Purpose_CarNew
0.0037 ± 0.0048 Savings_<500
0 ± 0.0000 Savings_none
0 ± 0.0000 Dependents
0 ± 0.0000 Property_rent
0 ± 0.0000 Job_management/self-emp/officer/highly qualif emp
0 ± 0.0000 Debtors_guarantor
0 ± 0.0000 Purpose_others
0 ± 0.0000 Purpose_furniture/equip
0 ± 0.0000 Purpose_CarUsed
0 ± 0.0000 Collateral_unknown/none
0 ± 0.0000 PayBackPercent
0 ± 0.0000 Job_unemp/unskilled-non resident
0 ± 0.0000 CreditHistory_none/paid
0 ± 0.0000 Debtors_none
0 ± 0.0000 Collateral_real estate
0 ± 0.0000 Marital_Status
0 ± 0.0000 Purpose_education
… 3 more …

2.b) gender as protected variable

2.b.1) Model Explainability

2.b.1) Using SHAP

In [101]:
mdl_gender = model.fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
In [102]:
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.TreeExplainer(grid_search.best_estimator_,dataset_aif_tranf_gender.features)
shap_values=explainer.shap_values(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
#https://github.com/slundberg/shap
In [103]:
shap_values
Out[103]:
array([[ 0.02358738,  0.06138272,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       [ 0.00327603,  0.08784266,  0.        , ...,  0.        ,
         0.        ,  0.00081661],
       [ 0.00327603,  0.04038655,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       ...,
       [-0.01179369, -0.13975128,  0.        , ...,  0.        ,
         0.        , -0.00840509],
       [ 0.00327603,  0.04038655,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       [-0.01179369,  0.12233785,  0.        , ...,  0.        ,
         0.        ,  0.00081661]])
In [104]:
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[0,:], dataset_aif_tranf_gender.feature_names)
Out[104]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [105]:
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[1,:], dataset_aif_tranf_gender.feature_names)
Out[105]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [106]:
shap.force_plot(explainer.expected_value, shap_values[:,:], X.iloc[:,:],feature_names = dataset_aif_tranf_gender.feature_names)
Out[106]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [107]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[0,:],feature_names=dataset_aif_tranf_gender.feature_names)

2.b.2) Using ELI5

In [108]:
perm_gender = PermutationImportance(model).fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
perm_imp=eli5.show_weights(perm_gender,feature_names = dataset_aif_tranf_gender.feature_names)
perm_imp
plt.show()
Out[108]:
Weight Feature
0.0147 ± 0.0065 NumMonths
0.0118 ± 0.0048 Age
0.0073 ± 0.0076 Purpose_CarNew
0.0057 ± 0.0065 Savings_<500
0 ± 0.0000 Savings_none
0 ± 0.0000 Dependents
0 ± 0.0000 Property_rent
0 ± 0.0000 Job_management/self-emp/officer/highly qualif emp
0 ± 0.0000 Debtors_guarantor
0 ± 0.0000 Purpose_others
0 ± 0.0000 Purpose_furniture/equip
0 ± 0.0000 Purpose_CarUsed
0 ± 0.0000 Collateral_unknown/none
0 ± 0.0000 PayBackPercent
0 ± 0.0000 Job_unemp/unskilled-non resident
0 ± 0.0000 CreditHistory_none/paid
0 ± 0.0000 Debtors_none
0 ± 0.0000 Collateral_real estate
0 ± 0.0000 Marital_Status
0 ± 0.0000 Purpose_education
… 3 more …
In [ ]:
 

2.c) with marital status as protected variable

In [109]:
mdl_marital = model.fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())

2.c.1) Model Explainability

2.c.1) Using SHAP

In [110]:
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.TreeExplainer(grid_search.best_estimator_,dataset_aif_tranf_marital.features)
shap_values=explainer.shap_values(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
#https://github.com/slundberg/shap
In [111]:
shap_values
Out[111]:
array([[ 0.02358738,  0.06138272,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       [ 0.00327603,  0.08784266,  0.        , ...,  0.        ,
         0.        ,  0.00081661],
       [ 0.00327603,  0.04038655,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       ...,
       [-0.01179369, -0.13975128,  0.        , ...,  0.        ,
         0.        , -0.00840509],
       [ 0.00327603,  0.04038655,  0.        , ...,  0.        ,
         0.        ,  0.00021752],
       [-0.01179369,  0.12233785,  0.        , ...,  0.        ,
         0.        ,  0.00081661]])
In [112]:
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[0,:], dataset_aif_tranf_marital.feature_names)
Out[112]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [113]:
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[1,:], dataset_aif_tranf_marital.feature_names)
Out[113]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [114]:
shap.force_plot(explainer.expected_value, shap_values[:,:], X.iloc[:,:],feature_names = dataset_aif_tranf_marital.feature_names)
Out[114]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [115]:
shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[0,:],feature_names=dataset_aif_tranf_marital.feature_names)

2.c.2) Using ELI5

In [116]:
perm_marital = PermutationImportance(model).fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
perm_imp=eli5.show_weights(perm_marital,feature_names = dataset_aif_tranf_marital.feature_names)
perm_imp
plt.show()
Out[116]:
Weight Feature
0.0155 ± 0.0080 Age
0.0073 ± 0.0042 NumMonths
0.0049 ± 0.0055 Purpose_CarNew
0.0049 ± 0.0080 Savings_<500
0 ± 0.0000 Savings_none
0 ± 0.0000 Dependents
0 ± 0.0000 Property_rent
0 ± 0.0000 Job_management/self-emp/officer/highly qualif emp
0 ± 0.0000 Debtors_guarantor
0 ± 0.0000 Purpose_others
0 ± 0.0000 Purpose_furniture/equip
0 ± 0.0000 Purpose_CarUsed
0 ± 0.0000 Collateral_unknown/none
0 ± 0.0000 PayBackPercent
0 ± 0.0000 Job_unemp/unskilled-non resident
0 ± 0.0000 CreditHistory_none/paid
0 ± 0.0000 Debtors_none
0 ± 0.0000 Collateral_real estate
0 ± 0.0000 Marital_Status
0 ± 0.0000 Purpose_education
… 3 more …
In [117]:
#!pip install shapash
In [118]:
#Training Tabular Explainer
#import lime.lime_tabular
#explainer = lime.lime_tabular.LimeTabularExplainer(dataset_aif_tranf_gender.features,
#                                                   mode='classification',
#                                                   feature_names=dataset_aif_tranf_gender.feature_names,
#                                                   class_names=dataset_aif_tranf_gender.labels.ravel())
In [119]:
# Function features_check Extract feature names from Lime Output to be used by shapash
#def features_check(s):
#    for w in list(dataset_orig_test.feature_names):
#        if f' {w} ' in f' {s} ' :
#            feat = w
#    return feat
##%%time
# Compute local Lime Explanation for each row in Test Sample
#contrib_l=[]
#for ind in dataset_orig_test.subset(0:1000):
#    exp = explainer.explain_instance(dataset_orig_test.ind.values, rf.predict_proba, num_features=dataset_orig_test.shape[1])
#    contrib_l.append(dict([[features_check(elem[0]),elem[1]] for elem in exp.as_list()]))
In [120]:
#contribution_df =pd.DataFrame(contrib_l,index=dataset_aif_test.index)
# sorting the columns as in the original dataset
#contribution_df = contribution_df[list(dataset_aif_test.columns)]
In [121]:
#from shapash.explainer.smart_explainer import SmartExplainer
#xpl = SmartExplainer() # optional parameter

#xpl.compile(
#x=dataset_aif_tranf_gender.features,
#model=model,
#)
In [122]:
#app = xpl.run_app()
In [ ]: